In [8]:
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
import numpy as np
%matplotlib inline
np.set_printoptions(precision=5, suppress=True)
In [9]:
np.random.seed(42)


def init(n, center, cov=None):
    """Draw ``n`` samples from a 2-D Gaussian blob centered at ``center``.

    Parameters
    ----------
    n : int
        Number of samples to draw.
    center : array-like of shape (2,)
        Mean of the distribution (the blob's center).
    cov : array-like of shape (2, 2), optional
        Covariance matrix. Defaults to ``[[4, 1], [1, 4]]``, the slightly
        correlated, equal-variance shape used throughout this notebook.

    Returns
    -------
    numpy.ndarray of shape (n, 2)
        The sampled points.
    """
    if cov is None:
        cov = [[4, 1], [1, 4]]
    return np.random.multivariate_normal(center, cov, size=n)
# Build the dataset: four Gaussian blobs of 200 points each, stacked
# into a single (800, 2) array.
blob_centers = [[9, 0], [9, 9], [0, 9], [0, 0]]
data = np.concatenate([init(200, center) for center in blob_centers])
print(data.shape)

# Quick visual sanity check of the raw, unlabeled points.
plt.scatter(data[:, 0], data[:, 1])
plt.show()
In [10]:
# plot the kmeans results
def kmeans_plot(data, kmeans, h=0.02):
    """Visualize a fitted k-means model on 2-D data.

    Paints the model's decision regions as a colored background, overlays
    the data points, and marks the centroids with white crosses.

    Parameters
    ----------
    data : numpy.ndarray of shape (n_samples, 2)
        The 2-D points that were clustered.
    kmeans : fitted estimator
        Any object exposing ``predict`` and ``cluster_centers_``
        (e.g. a fitted ``sklearn.cluster.KMeans``).
    h : float, optional
        Mesh step size for the background grid. Smaller values give a
        finer (but slower-to-render) decision boundary. Default 0.02.
    """
    # Pad the bounding box by 1 unit so the boundary regions are visible.
    x_min, x_max = data[:, 0].min() - 1, data[:, 0].max() + 1
    y_min, y_max = data[:, 1].min() - 1, data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Label every mesh point to paint the decision regions.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(1)
    plt.clf()
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')
    plt.plot(data[:, 0], data[:, 1], 'k.', markersize=2)

    # Draw the centroids as white crosses above everything else.
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1],
                marker='x', s=169, linewidths=3,
                color='w', zorder=10)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
In [11]:
# Fit k-means with the true number of clusters and inspect the result.
kmeans = KMeans(
    n_clusters=4,       # number of clusters to form
    max_iter=100,       # maximum iterations of a single k-means run
    n_init=10,          # runs with different centroid seeds (best is kept)
    init='k-means++',   # strategy for choosing the initial centers
    random_state=42,
).fit(data)
kmeans_plot(data, kmeans)
print('Sum of distances to the centroids: {}'.format(kmeans.inertia_))
In [12]:
# Watch convergence step by step: start all four centroids far away from
# the data (bottom-left corner) and cap the iteration count at 1..10 so
# each plot shows one more step of the algorithm.
# (Swap `init=init_array` for 'k-means++' or 'random' to compare
# initialization strategies.)
init_array = np.asarray([[-10, -10], [-9, -10], [-10, -9], [-9, -9]])
for n_iter in range(1, 11):
    kmeans = KMeans(n_clusters=4, max_iter=n_iter, n_init=1, init=init_array, random_state=42)
    kmeans.fit(data)
    kmeans_plot(data, kmeans)
    print('Sum of distances to the centroids: {}'.format(kmeans.inertia_))
In [15]:
# Elbow method: fit k-means for k = 1..30 and record the inertia
# (within-cluster sum of squared distances) for each k.
d = []
for k in range(1, 31):
    kmeans = KMeans(n_clusters=k, max_iter=300, n_init=20, init='k-means++', random_state=42)
    kmeans.fit(data)
    kmeans_plot(data, kmeans)
    print('Sum of distances to the centers: {}'.format(kmeans.inertia_))
    d.append([k, kmeans.inertia_])
d = np.asarray(d)  # shape (30, 2): column 0 is k, column 1 is the inertia
In [16]:
# Elbow plot: inertia vs. number of clusters. The "elbow" marks a good k.
plt.plot(d[:, 0], d[:, 1], marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Sum of squared distances (inertia)')
plt.title('Elbow method for selecting k')
plt.show()
Out[16]:
As the plot above shows, there is an obvious turning point ("elbow") at k = 4, after which the cost levels off into a plateau. This is a simple way of selecting k, commonly known as the elbow method.
In [ ]: